Address the data science problem
# Load the survey responses. The first read pulls only the header to get the
# column names; the second read skips the first two lines of the file
# (the header and, presumably, the question-text row) and applies those names.
survey_path <- "data/kaggle_survey_2021_responses.csv"
col_names <- names(read_csv(survey_path, n_max = 0))

# Keep respondents located in the United States only.
dat <- read_csv(survey_path, col_names = col_names, skip = 2) %>%
  filter(Q3 == "United States of America")

# Subset restricted to the job titles this report focuses on.
target_titles <- c(
  "Data Analyst",
  "Data Engineer",
  "Data Scientist",
  "Machine Learning Engineer",
  "Software Engineer",
  "Statistician",
  "Student"
)
job.dat <- dat %>%
  filter(Q5 %in% target_titles)
Since the one large data science problem can be divided into several smaller bits, we decide to combine the analysis and the interpretation sections together. And the questions will be addressed and discussed one by one.
The following questions are about data science career paths.
What percentage of the survey respondents are working under these job titles?
# Count respondents per job title (Q5), most frequent first, and convert the
# frequency table to a tibble with columns `Var1` and `Freq`.
# `as_tibble()` replaces the deprecated `as.tibble()`.
jtitle <- sort(table(dat$Q5), decreasing = TRUE) %>%
  as.data.frame() %>%
  as_tibble()
jtitle <- rename(jtitle, `Job Title` = Var1)

# Pie chart: a single stacked bar wrapped onto polar coordinates.
ggplot(jtitle, aes(x = "", y = Freq, fill = `Job Title`)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void()
What are the statistics on salaries for these job titles?
The US federal minimum wage is USD 7.25 per hour. Multiplying that by 24 hours a day and a typical \(260\) workdays a year gives:
\[7.25 \frac{\$}{\text{hour}} \cdot 24 \frac{\text{hours}}{\text{day}} \cdot 260\frac{\text{days}}{\text{year}} = 45240.\]
The new categories* for salaries will be:

- poverty: below the federal minimum wage (0 to 39,999)
- low: 40,000 to 79,999
- medium: 80,000 to 124,999
- high: 125,000 to 199,999
- very high: 200,000 to 499,999
- highest: >= 500,000

*loosely based on US federal income tax brackets.
# Collapse the raw salary ranges in Q25 into six bands and store them as an
# ordered factor whose levels run from "poverty" (lowest) to "highest".
poverty <- c(
  "$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999",
  "5,000-7,499", "7,500-9,999", "10,000-14,999", "15,000-19,999",
  "20,000-24,999", "25,000-29,999", "30,000-39,999"
)
low <- c("40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999")
medium <- c("80,000-89,999", "90,000-99,999", "100,000-124,999")
high <- c("125,000-149,999", "150,000-199,999")
very_high <- c("200,000-249,999", "250,000-299,999", "300,000-499,999")
highest <- c("$500,000-999,999", ">$1,000,000")

dat <- dat %>%
  mutate(
    Q25 = case_when(
      Q25 %in% poverty ~ "poverty",
      Q25 %in% low ~ "low",
      Q25 %in% medium ~ "medium",
      Q25 %in% high ~ "high",
      Q25 %in% very_high ~ "very high",
      Q25 %in% highest ~ "highest",
      # Anything unmatched is left as-is here and becomes NA in factor().
      TRUE ~ as.character(Q25)
    ),
    Q25 = factor(
      Q25,
      levels = c("poverty", "low", "medium", "high", "very high", "highest"),
      ordered = TRUE
    )
  )
# Job-title groups used to split the salary plots into separate figures.
data_side <- c(
  "Data Scientist", "Data Analyst", "Business Analyst",
  "Data Engineer", "Statistician", "DBA/Database Engineer"
)
swe_side <- c(
  "Software Engineer", "Machine Learning Engineer",
  "Program/Project Manager", "Product Manager"
)
academia <- c("Student", "Other", "Research Scientist")

# Salary band against job title for the data-oriented group; geom_count()
# sizes each point by the number of respondents at that combination.
# `%in%` is never TRUE for a missing Q5, so only Q25 needs an NA check.
dat %>%
  filter(Q5 %in% data_side, !is.na(Q25)) %>%
  ggplot(aes(x = Q5, y = Q25, color = Q5)) +
  geom_count() +
  ggtitle("Two-Way Salary Visualizations: Data-Oriented Jobs") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# Salary band against job title for the engineering-oriented group;
# geom_count() sizes each point by the number of respondents.
# `%in%` is never TRUE for a missing Q5, so only Q25 needs an NA check.
dat %>%
  filter(Q5 %in% swe_side, !is.na(Q25)) %>%
  ggplot(aes(x = Q5, y = Q25, color = Q5)) +
  geom_count() +
  ggtitle("Two-Way Salary Visualizations: Engineering-Oriented Jobs") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# Salary band against job title for the academic/other group;
# geom_count() sizes each point by the number of respondents.
# `%in%` is never TRUE for a missing Q5, so only Q25 needs an NA check.
dat %>%
  filter(Q5 %in% academia, !is.na(Q25)) %>%
  ggplot(aes(x = Q5, y = Q25, color = Q5)) +
  geom_count() +
  ggtitle("Two-Way Salary Visualizations: Academic Jobs and Others") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
What levels of education are required for these job titles?
Is there a significant income gap between genders for these jobs?
What is the typical skill set for these jobs? How does it affect the pay rate?
# Share of respondents per job title who selected each tool/skill across the
# Q7, Q9, Q12, Q14, Q16, Q17, Q18 and Q19 multi-select questions.
#
# A constant "TotalHelper" value is added to every respondent, so after
# counting, its per-title count equals the number of respondents with that
# title. `max(n)` is then used as the denominator (this assumes no skill value
# is counted more than once per respondent -- TODO confirm across questions).
# Skills below a 10% share and the helper row itself are dropped.
skill.set <- job.dat %>%
  select(c(
    Q5, starts_with("Q7_"), starts_with("Q9_"),
    starts_with("Q12_"), starts_with("Q14_"),
    starts_with("Q16_"), starts_with("Q17_"),
    starts_with("Q18_"), starts_with("Q19_")
  )) %>%
  mutate(Total = "TotalHelper") %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "skillset",
    values_drop_na = TRUE
  ) %>%
  filter(!skillset %in% c("None", "Other")) %>%
  rename(title = Q5) %>%
  count(title, skillset) %>%
  group_by(title) %>%
  mutate(prop = round(n / max(n), 3)) %>%
  filter(prop >= 0.1 & skillset != "TotalHelper") %>%
  select(-n) %>%
  arrange(title, desc(prop)) %>%
  # Grouping was only needed for the per-title denominator.
  ungroup()

# Interactive table with per-column filters.
datatable(skill.set, filter = "top", width = 600)
Is there a certain correlation between industry and the need for these jobs?
# Non-student respondents in six selected industries, with the salary range
# string in Q25 split into numeric lower/upper bounds.
# NOTE(review): this parses Q25 as raw range text (e.g. "1,000-1,999"), so it
# relies on job.dat still holding the unrecoded salary strings -- confirm.
selected_industries <- c(
  "Academics/Education",
  "Accounting/Finance",
  "Computers/Technology",
  "Insurance/Risk Assessment",
  "Medical/Pharmaceutical",
  "Online Service/Internet-based Services"
)

industry.dat <- job.dat %>%
  filter(Q5 != "Student", Q20 %in% selected_industries) %>%
  select(Q5, Q20, Q25) %>%
  mutate(
    # Strip "$" and thousands separators, then give the open-ended top
    # bracket an explicit upper bound so it splits like the others.
    Q25 = str_remove_all(Q25, "[$,]"),
    Q25 = str_replace(Q25, ">1000000", "1000000-2000000")
  ) %>%
  separate(Q25, into = c("salary_lb", "salary_ub"), sep = "-") %>%
  mutate(
    salary_lb = as.numeric(salary_lb),
    salary_ub = as.numeric(salary_ub)
  )
# Respondent counts per job title, one facet per industry; industries are
# ordered by their total count.
industry_counts <- industry.dat %>%
  count(Q5, Q20) %>%
  mutate(Q20 = fct_reorder(Q20, n, .fun = "sum")) %>%
  rename(title = Q5, Industry = Q20, count = n)

p <- ggplot(industry_counts, aes(x = title, y = count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ Industry) +
  labs(
    title = "Users' work industry",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p)

# Chi-squared test of independence between job title and industry.
chisq.test(table(industry.dat$Q5, industry.dat$Q20))
## Warning in chisq.test(table(industry.dat$Q5, industry.dat$Q20)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(industry.dat$Q5, industry.dat$Q20)
## X-squared = 108.6, df = 25, p-value = 2.153e-12
# Distribution of the salary lower bound per industry, one facet per job
# title; industries are ordered by their number of rows.
salary_by_industry <- industry.dat %>%
  mutate(Q20 = fct_reorder(Q20, salary_lb, .fun = "length"))

ggplot(salary_by_industry, aes(x = Q20, y = salary_lb)) +
  geom_boxplot() +
  coord_flip() +
  facet_wrap(~ Q5) +
  labs(
    title = "Users' salary vs industry",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Warning: Removed 35 rows containing non-finite values (stat_boxplot).
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
The following questions are mainly about data science skills, tools and technologies.
What programming languages and IDEs do they use?
Survey questions Q7 (daily-used programming language), Q9 (IDE).
# Count of each programming language (Q7 multi-select) per job title, plus
# the language's share of all selections within that title.
programming <- job.dat %>%
  select(c(Q5, starts_with("Q7_"))) %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "language",
    values_drop_na = TRUE
  ) %>%
  rename(title = Q5) %>%
  select(-fake_key) %>%
  filter(!language %in% c("None", "Other")) %>%
  count(title, language, .drop = FALSE) %>%
  # Add title/language pairs nobody selected, with a zero count.
  complete(title, language, fill = list(n = 0)) %>%
  group_by(title) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup()

# Heat map of the shares; `text` supplies the plotly hover tooltip.
p <- programming %>%
  mutate(text = paste0(
    "Language: ", language, "\n",
    "Job title: ", title, "\n",
    "Count: ", n, "\n",
    "Proportion: ", round(prop, 3)
  )) %>%
  ggplot(aes(language, title, fill = prop, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Users' favorite programming language",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p, tooltip = "text")
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
# Count of each IDE (Q9 multi-select) per job title, plus the IDE's share of
# all selections within that title. Two long labels are shortened.
ide <- job.dat %>%
  select(c(Q5, starts_with("Q9_"))) %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "IDE",
    values_drop_na = TRUE
  ) %>%
  rename(title = Q5) %>%
  select(-fake_key) %>%
  mutate(IDE = case_when(
    IDE == "Visual Studio Code (VSCode)" ~ "VSCode",
    IDE == "Jupyter (JupyterLab, Jupyter Notebooks, etc)" ~ "Jupyter Notebook",
    TRUE ~ IDE
  )) %>%
  filter(!IDE %in% c("None", "Other")) %>%
  count(title, IDE, .drop = FALSE) %>%
  # Add title/IDE pairs nobody selected, with a zero count.
  complete(title, IDE, fill = list(n = 0)) %>%
  group_by(title) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup()

# Heat map of the shares; `text` supplies the plotly hover tooltip.
p <- ide %>%
  mutate(text = paste0(
    "IDE: ", IDE, "\n",
    "Job title: ", title, "\n",
    "Count: ", n, "\n",
    "Proportion: ", round(prop, 3)
  )) %>%
  ggplot(aes(IDE, title, fill = prop, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Users' favorite IDE",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p, tooltip = "text")
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
Where do they get and share the knowledge?
Survey questions Q39 (share and deploy), Q40 (learning resources), Q42 (Media sources).
# Count of each learning platform (Q40 multi-select) per job title, plus the
# platform's share of all selections within that title. Two long labels are
# shortened.
learning_platform <- job.dat %>%
  select(c(Q5, starts_with("Q40_"))) %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "learning",
    values_drop_na = TRUE
  ) %>%
  rename(title = Q5) %>%
  select(-fake_key) %>%
  mutate(learning = case_when(
    learning == "Cloud-certification programs (direct from AWS, Azure, GCP, or similar)" ~ "Cloud-certif Programs",
    learning == "University Courses (resulting in a university degree)" ~ "University",
    TRUE ~ learning
  )) %>%
  filter(!learning %in% c("None", "Other")) %>%
  count(title, learning, .drop = FALSE) %>%
  # Add title/platform pairs nobody selected, with a zero count.
  complete(title, learning, fill = list(n = 0)) %>%
  group_by(title) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup()

# Heat map of the shares; `text` supplies the plotly hover tooltip.
p <- learning_platform %>%
  mutate(text = paste0(
    "Platform: ", learning, "\n",
    "Job title: ", title, "\n",
    "Count: ", n, "\n",
    "Proportion: ", round(prop, 3)
  )) %>%
  ggplot(aes(learning, title, fill = prop, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Users' favorite learning platforms",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p, tooltip = "text")
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
# Count of each sharing/deployment platform (Q39 multi-select) per job title,
# plus the platform's share of all selections within that title. The
# "do not share" answer is relabelled 'PRIVATE'.
share_deploy <- job.dat %>%
  select(c(Q5, starts_with("Q39_"))) %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "share",
    values_drop_na = TRUE
  ) %>%
  rename(title = Q5) %>%
  select(-fake_key) %>%
  mutate(share = case_when(
    share == "I do not share my work publicly" ~ "\'PRIVATE\'",
    TRUE ~ share
  )) %>%
  filter(!share %in% c("Other")) %>%
  count(title, share, .drop = FALSE) %>%
  # Add title/platform pairs nobody selected, with a zero count.
  complete(title, share, fill = list(n = 0)) %>%
  group_by(title) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup()

# Heat map of the shares; `text` supplies the plotly hover tooltip.
p <- share_deploy %>%
  mutate(text = paste0(
    "Platform: ", share, "\n",
    "Job title: ", title, "\n",
    "Count: ", n, "\n",
    "Proportion: ", round(prop, 3)
  )) %>%
  ggplot(aes(share, title, fill = prop, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Users' favorite share platforms",
    x = "",
    y = "",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p, tooltip = "text")
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
# Count of each media source (Q42 multi-select) per job title, plus the
# source's share of all selections within that title. The label is then split
# at the first " (" so the plot shows the short name only.
media_source <- job.dat %>%
  select(c(Q5, starts_with("Q42_"))) %>%
  # pivot_longer() replaces the superseded gather(); missing answers dropped.
  pivot_longer(
    -Q5,
    names_to = "fake_key",
    values_to = "media",
    values_drop_na = TRUE
  ) %>%
  rename(title = Q5) %>%
  select(-fake_key) %>%
  filter(!media %in% c("None", "Other")) %>%
  count(title, media, .drop = FALSE) %>%
  # Add title/source pairs nobody selected, with a zero count.
  complete(title, media, fill = list(n = 0)) %>%
  group_by(title) %>%
  mutate(prop = prop.table(n)) %>%
  ungroup() %>%
  separate(media, into = c("media", "media_suffix"), sep = " \\(")

# Heat map of the shares; `text` supplies the plotly hover tooltip.
p <- media_source %>%
  mutate(text = paste0(
    "Platform: ", media, "\n",
    "Job title: ", title, "\n",
    "Count: ", n, "\n",
    "Proportion: ", round(prop, 3)
  )) %>%
  ggplot(aes(media, title, fill = prop, text = text)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Users' favorite media source",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggplotly(p, tooltip = "text")
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
What data science packages do they use?
Survey questions Q14 (visualization libraries), Q16 (machine learning related libraries).
# One row per (job title, visualization library) selection from the Q14
# multi-select columns. Q14_Part_11 and Q14_OTHER are excluded, rows with a
# missing value are dropped, and the "Other" job title is removed.
viz_lib <- dat %>%
  select(Q5, starts_with("Q14"), -Q14_Part_11, -Q14_OTHER) %>%
  pivot_longer(cols = starts_with("Q14")) %>%
  select(-name) %>%
  drop_na() %>%
  filter(Q5 != "Other")

# Horizontal bar counts per library, one facet per job title. The y-axis
# labels are hidden; the fill legend identifies each library.
ggplot(viz_lib, aes(y = value, fill = value)) +
  geom_bar() +
  facet_wrap(~ Q5) +
  scale_fill_brewer(palette = "Spectral") +
  labs(
    title = "Data science practitioners' favorite visualization libraries",
    x = "",
    y = "",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_line(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): Windows字体数据
## 库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# One row per (job title, ML library) selection from the Q16 multi-select
# columns. Q16_Part_17 and Q16_OTHER are excluded, rows with a missing value
# are dropped, and the "Other" job title is removed.
ml_lib <- dat %>%
  select(Q5, starts_with("Q16")) %>%
  select(-c(Q16_Part_17, Q16_OTHER)) %>%
  pivot_longer(cols = starts_with("Q16")) %>%
  select(-name) %>%
  drop_na() %>%
  filter(Q5 != "Other")

# The ten most-selected libraries overall. slice_max() names the ranking
# column explicitly; the superseded top_n(10) picked it implicitly. Ties at
# the cut-off are kept, as top_n() did.
top_10_ml <- ml_lib %>%
  count(value, name = "count") %>%
  slice_max(count, n = 10)

# Restrict the plot data to those libraries.
ml_lib <- ml_lib %>%
  filter(value %in% top_10_ml$value)

# Horizontal bar counts per library, one facet per job title. The y-axis
# labels are hidden; the fill legend identifies each library.
ggplot(ml_lib) +
  geom_bar(aes(y = value, fill = value)) +
  facet_wrap(~ Q5) +
  scale_fill_brewer(palette = "Spectral") +
  labs(
    title = "Data science practitioners' favorite ML libraries",
    x = "",
    y = "",
    caption = glue("Author: celeritasML
Source: Kaggle")
  ) +
  theme(
    axis.ticks.x = element_line(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
Will a user’s preference for cloud computing platforms affect his or her preference for other tools? For example, we want to know if an AWS EC2 dedicated user will actually prefer AWS S3 over other products.
Survey question Q29-A: computing products (Part_1)

Survey question Q30-A: storage products (Part_3, Part_4)

Survey question Q31-A: ML products (Part_1)
\[\chi^2=\sum\frac{(O_i-E_i)^2}{E_i}\]
# 0/1 indicator: 1 when the respondent selected the option (non-missing
# answer), 0 otherwise.
flag_selected <- function(x) {
  if_else(is.na(x), 0, 1)
}

# Usage indicators for eight AWS products, one row per respondent.
aws_user <- tibble(
  ec2 = flag_selected(dat$Q29_A_Part_1),
  s3 = flag_selected(dat$Q30_A_Part_3),
  efs = flag_selected(dat$Q30_A_Part_4),
  sagemaker = flag_selected(dat$Q31_A_Part_1),
  redshift = flag_selected(dat$Q32_A_Part_11),
  aurora = flag_selected(dat$Q32_A_Part_12),
  rds = flag_selected(dat$Q32_A_Part_13),
  dynamodb = flag_selected(dat$Q32_A_Part_14)
)

# Chi-squared tests of independence between EC2 usage and each other product.
chisq.test(aws_user$ec2, aws_user$s3)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: aws_user$ec2 and aws_user$s3
## X-squared = 1532, df = 1, p-value < 2.2e-16
chisq.test(aws_user$ec2, aws_user$efs)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: aws_user$ec2 and aws_user$efs
## X-squared = 637.99, df = 1, p-value < 2.2e-16
chisq.test(aws_user$ec2, aws_user$sagemaker)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: aws_user$ec2 and aws_user$sagemaker
## X-squared = 566.34, df = 1, p-value < 2.2e-16
What is the overall AWS usage percentage among DS practitioners? Is it the same for Google Cloud?
\(H_0: p_A=p_B\), \(H_a:p_A\not=p_B\), where \(A\) and \(B\) can be replaced by AWS, Azure, or GCP, and \(n_A\), \(n_B\) are sample size of group \(A\) and \(B\) respectively.
The test statistic (z-statistic) can be calculated as follows:
\[z=\frac{p_A-p_B}{\sqrt{p(1-p)/n_A+p(1-p)/n_B}},\]
where \(p=\frac{n_Ap_A+n_Bp_B}{n_A+n_B}\) is the pooled proportion under \(H_0\).
# Logical usage flags per respondent: TRUE when the corresponding Q27-A
# option was selected (non-missing answer).
cloud_comp <- tibble(
  aws_usage = !is.na(dat$Q27_A_Part_1),
  azure_usage = !is.na(dat$Q27_A_Part_2),
  gcp_usage = !is.na(dat$Q27_A_Part_3)
)

# Pairwise two-sample tests for equality of usage proportions.
# NOTE(review): both proportions in each test come from the same respondents,
# so the samples are not independent; a paired test (e.g. mcnemar.test) may
# be more appropriate -- confirm with the authors.
prop.test(
  c(sum(cloud_comp$aws_usage), sum(cloud_comp$azure_usage)),
  c(nrow(cloud_comp), nrow(cloud_comp)),
  alternative = "two.sided",
  correct = TRUE
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(sum(cloud_comp$aws_usage), sum(cloud_comp$azure_usage)) out of c(nrow(cloud_comp), nrow(cloud_comp))
## X-squared = 81.285, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.07606175 0.11865523
## sample estimates:
## prop 1 prop 2
## 0.2377358 0.1403774
prop.test(
  c(sum(cloud_comp$azure_usage), sum(cloud_comp$gcp_usage)),
  c(nrow(cloud_comp), nrow(cloud_comp)),
  alternative = "two.sided",
  correct = TRUE
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(sum(cloud_comp$azure_usage), sum(cloud_comp$gcp_usage)) out of c(nrow(cloud_comp), nrow(cloud_comp))
## X-squared = 1.8823, df = 1, p-value = 0.1701
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.005495458 0.031910552
## sample estimates:
## prop 1 prop 2
## 0.1403774 0.1271698
prop.test(
  c(sum(cloud_comp$gcp_usage), sum(cloud_comp$aws_usage)),
  c(nrow(cloud_comp), nrow(cloud_comp)),
  alternative = "two.sided",
  correct = TRUE
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(sum(cloud_comp$gcp_usage), sum(cloud_comp$aws_usage)) out of c(nrow(cloud_comp), nrow(cloud_comp))
## X-squared = 107.85, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.1315249 -0.0896072
## sample estimates:
## prop 1 prop 2
## 0.1271698 0.2377358
# R scripts here.
See footnote 11. See GitHub repository tufte.
Since the same code will be used in Problem 1-c, I will just use this part as a setup.